# Packages used below: static plots, fast file reading, interactive charts.
library(ggplot2)
library(data.table)
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
library(plotly, quietly = TRUE)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
查看商家的基本统计情况:
# Load the shop metadata. The file is read without a header row, so the
# column names are assigned here.
shop_info <- read.csv("~/tianchi2017/dataset/shop_info.txt", header = FALSE)
names(shop_info) <- c(
  "shop_id", "city_name", "location_id", "per_pay", "score",
  "comment_cnt", "shop_level", "cate_1_name", "cate_2_name", "cate_3_name"
)
# Replace every missing cell with 0 before summarising.
shop_info[is.na(shop_info)] <- 0
summary(shop_info)
## shop_id city_name location_id per_pay
## Min. : 1.0 上海 :285 Min. : 1.0 Min. : 1.00
## 1st Qu.: 500.8 杭州 :225 1st Qu.: 287.8 1st Qu.: 5.00
## Median :1000.5 北京 :163 Median : 577.5 Median :10.00
## Mean :1000.5 广州 :136 Mean : 583.1 Mean :10.48
## 3rd Qu.:1500.2 南京 :130 3rd Qu.: 877.2 3rd Qu.:15.00
## Max. :2000.0 武汉 :124 Max. :1159.0 Max. :20.00
## (Other):937
## score comment_cnt shop_level cate_1_name
## Min. :0.000 Min. : 0.000 Min. :0.0000 超市便利店 : 579
## 1st Qu.:1.000 1st Qu.: 0.000 1st Qu.:0.0000 购物 : 1
## Median :3.000 Median : 2.000 Median :1.0000 美发/美容/美甲: 1
## Mean :2.288 Mean : 2.675 Mean :0.8145 美食 :1415
## 3rd Qu.:3.000 3rd Qu.: 4.000 3rd Qu.:2.0000 休闲娱乐 : 2
## Max. :4.000 Max. :20.000 Max. :2.0000 医疗健康 : 2
##
## cate_2_name cate_3_name
## 快餐 :639 :585
## 超市 :372 西式快餐:405
## 便利店 :206 中式快餐:220
## 休闲茶饮:177 生鲜水果:111
## 小吃 :156 奶茶 : 92
## 休闲食品:150 其它小吃: 87
## (Other) :300 (Other) :500
# Plots: histograms of per_pay, score and comment_cnt across shops.
ggplot(shop_info, aes(x = per_pay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(shop_info, aes(x = score)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(shop_info, aes(x = comment_cnt)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Violin plots of per_pay and comment_cnt, one violin per score value.
ggplot(
  shop_info,
  aes(x = factor(score), y = per_pay, fill = factor(score))
) +
  geom_violin()

ggplot(
  shop_info,
  aes(x = factor(score), y = comment_cnt, fill = factor(score))
) +
  geom_violin()

查看用户浏览的基本情况:
# Load the view log (read without a header row) and parse the timestamps
# in the system timezone.
user_view <- read.csv("~/tianchi2017/dataset/user_view.txt", header = FALSE)
names(user_view) <- c("user_id", "shop_id", "time_stamp")
user_view$time_stamp <- as.POSIXct(
  user_view$time_stamp,
  format = "%Y-%m-%d %H:%M:%S",
  tz = Sys.timezone()
)
# Count view records per shop and keep them as a (shop_id, freq) data frame.
views_per_shop <- table(user_view$shop_id)
view_stat <- data.frame(
  shop_id = as.numeric(names(views_per_shop)),
  freq = as.vector(views_per_shop)
)
nrow(view_stat) # Three shops have no view records.
## [1] 1997
# Keep every shop from both tables; NA cells (e.g. freq for shops without
# views) are then replaced with 0.
view_shop_info <- merge(shop_info, view_stat, by = "shop_id", all = TRUE)
view_shop_info[is.na(view_shop_info)] <- 0
# Plots: view counts per score value, raw and on a log2(freq + 1) scale.
ggplot(
  view_shop_info,
  aes(x = factor(score), y = freq, fill = factor(score))
) +
  geom_violin()

ggplot(
  view_shop_info,
  aes(x = factor(score), y = log2(freq + 1), fill = factor(score))
) +
  geom_violin()

# log2 view counts per per_pay level, first for shops with score 2 and
# then for shops with score 4, zoomed to the same y range.
score_2_shops <- view_shop_info[view_shop_info$score == 2, ]
ggplot(score_2_shops, aes(x = factor(per_pay), y = log2(freq + 1))) +
  geom_violin() +
  ggtitle("per_pay freq plot for shops with 2 star") +
  coord_cartesian(ylim = c(6, 16))

score_4_shops <- view_shop_info[view_shop_info$score == 4, ]
ggplot(score_4_shops, aes(x = factor(per_pay), y = log2(freq + 1))) +
  geom_violin() +
  ggtitle("per_pay freq plot for shops with 4 star") +
  coord_cartesian(ylim = c(6, 16))

# Theme shared by the three category plots: STKaiti font for the text and
# x-axis labels rotated 90 degrees.
category_plot_theme <- theme(
  text = element_text(family = "STKaiti", size = 14),
  axis.text.x = element_text(angle = 90, hjust = 1)
)

# per_pay distribution for each level-1, level-2 and level-3 category name.
ggplot(view_shop_info, aes(x = factor(cate_1_name), y = per_pay)) +
  geom_violin() +
  ggtitle("一级品类名称") +
  category_plot_theme

ggplot(view_shop_info, aes(x = factor(cate_2_name), y = per_pay)) +
  geom_violin() +
  ggtitle("二级分类名称") +
  category_plot_theme

ggplot(view_shop_info, aes(x = factor(cate_3_name), y = per_pay)) +
  geom_violin() +
  ggtitle("三级分类名称") +
  category_plot_theme

summary(view_stat$freq) # Distribution of per-shop view counts, incl. the mean.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40 469 1214 2783 3515 62660
# Select the shops whose total view count lies within 20 of the mean.
# The mean is computed from the data instead of being copied from the
# printed summary, so the selection stays correct if the data changes.
# It is rounded to a whole number, as in the summary shown above.
mean_views <- round(mean(view_stat$freq))
avg_shop <- view_stat[abs(view_stat$freq - mean_views) < 20, "shop_id"]
# Bucket views by calendar day. as.Date() on a POSIXct value defaults to
# UTC, so pass the timezone explicitly (the system timezone, which is what
# time_stamp was parsed in) to avoid assigning views to the wrong day.
user_view$day_time <- as.Date(user_view$time_stamp, tz = Sys.timezone())
# Daily view counts for the selected shops, one row per (day, shop) pair.
avg_view_time_series <- data.frame(
  table(user_view[user_view$shop_id %in% avg_shop, c("day_time", "shop_id")])
)
ggplot(
  avg_view_time_series,
  aes(x = as.Date(day_time), y = Freq, col = shop_id)
) +
  geom_line() +
  xlab("") +
  ylab("Daily Views")

# Daily view counts for every (day, shop) pair, shown as a histogram on a
# log2(Freq + 1) scale.
view_time_series <- data.frame(table(user_view[c("day_time", "shop_id")]))
ggplot(view_time_series, aes(x = log2(Freq + 1))) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

查看用户pay的情况:
# The payment log is large, so it is read with fread() and then modified by
# reference (setnames / :=) rather than with `colnames<-` / `$<-`, which
# copy the table.
user_pay <- fread("~/tianchi2017/dataset/user_pay.txt", header = FALSE)
setnames(user_pay, c("user_id", "shop_id", "time_stamp"))
# Convert time_stamp to Date; any time-of-day part is dropped.
user_pay[, time_stamp := as.Date(time_stamp)]
# Number of payment records per (date, shop) pair, in long format.
pay_time_series <- data.frame(table(user_pay[, .(time_stamp, shop_id)]))
探索天气和支付的关系
# Box plot of pay for each weather value in the example weather data set.
# NOTE(review): the name `all` shadows base::all(); it is kept unchanged
# because code outside this section may refer to it.
all <- read.csv("~/tianchi2017/dataset/example_weather.csv")
plot_ly(data = all, x = ~weather, y = ~pay, type = "box")
## Warning: Ignoring 2 observations